sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_3.5.1 magrittr_1.5 tools_3.5.1 htmltools_0.3.6
## [5] yaml_2.2.0 Rcpp_1.0.0 stringi_1.2.4 rmarkdown_1.11
## [9] knitr_1.20 stringr_1.3.1 digest_0.6.18 evaluate_0.12
# # https://gist.github.com/smithdanielle/9913897
# check.packages <- function(pkg){
# new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
# if (length(new.pkg))
# install.packages(new.pkg, dependencies = TRUE)
# #sapply(pkg, require, character.only = TRUE)
# sapply(pkg, library, character.only = TRUE) # see comment below in GitHub repo
# }
#
# # Usage example
# packages<-c("ggplot2", "dplyr", "caret", "caTools", "neuralnet", "tictoc", "randomForest", "DT", "e1071", "xgboost")
# check.packages(packages)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(DT)
Prepare Data
Read and Clean Features
# Load the feature table from disk and print its structure.
features <- read.csv(file = "../../Data/features.csv")
str(features)
## 'data.frame': 10000 obs. of 241 variables:
## $ JobName: Factor w/ 10000 levels "Job_00001","Job_00002",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ x1 : num 2.073 2.268 1.742 0.787 2.334 ...
## $ x2 : num 4.92 4.96 2.06 2.61 4.3 ...
## $ x3 : num 20 19.1 13.4 17.2 14.6 ...
## $ x4 : num 3.52 19.76 38.83 64.4 52.54 ...
## $ x5 : num 7.86 6.93 6.27 5.38 6.79 ...
## $ x6 : num 1.607 1.362 2.053 0.907 2.461 ...
## $ x7 : num 2.98 2.39 2.04 2.4 2.89 ...
## $ x8 : num 8.54 6.56 10.28 13.49 9.36 ...
## $ x9 : num 1.103 0.589 4.834 3.34 1.246 ...
## $ x10 : num 4.61 1.03 4.39 4.51 1.73 ...
## $ x11 : num 1.05e-07 1.03e-07 1.06e-07 9.47e-08 1.01e-07 1.07e-07 9.89e-08 9.30e-08 9.70e-08 9.47e-08 ...
## $ x12 : num 8 7.49 6.35 9.55 9.6 ...
## $ x13 : num 13.22 22.56 15.05 17.17 5.79 ...
## $ x14 : num 4.38 2.06 3.26 3.09 3.94 ...
## $ x15 : num 0.237 0.564 2.06 1.881 1.582 ...
## $ x16 : num 6.08 6.9 8.42 11.19 7.1 ...
## $ x17 : num 3.99 4.15 4.49 2.13 3.56 ...
## $ x18 : num 4.77 6.85 3.49 5.59 7.77 ...
## $ x19 : num 2.7 9.62 4.72 5.11 1.36 ...
## $ x20 : num 1.04 1.92 1.56 1.49 1.24 ...
## $ x21 : num 42.4 26.6 20.1 32.6 44.6 ...
## $ x22 : num 1.36 4.05 3.08 1.36 1.94 ...
## $ x23 : num 2.7 2.38 4.49 3.4 2.25 ...
## $ stat1 : num 2.38 -1.407 -0.767 0.437 2.449 ...
## $ stat2 : num 0.188 1.814 -0.123 -1.936 -0.617 ...
## $ stat3 : num -1.228 1.62 1.142 0.903 -2.552 ...
## $ stat4 : num -0.6 2.64 2.98 -1.6 -2.15 ...
## $ stat5 : num 0.1489 1.9208 2.4226 -0.0018 -2.3111 ...
## $ stat6 : num -0.662 1.741 -0.417 -0.695 -1.017 ...
## $ stat7 : num -2.485 -1.96 2.221 -0.369 2.727 ...
## $ stat8 : num 0.365 -2.019 -2.674 -0.971 1.542 ...
## $ stat9 : num 2.536 -1.373 0.484 1.796 -1.316 ...
## $ stat10 : num 2.9207 -0.3164 2.7338 0.7477 -0.0977 ...
## $ stat11 : num -2.323 -0.855 -2.182 1.398 0.957 ...
## $ stat12 : num -2.48 1.12 2.87 1.86 2.57 ...
## $ stat13 : num -0.634 0.723 -2.976 -1.038 0.318 ...
## $ stat14 : num -0.365 0.212 2.987 2.334 1.031 ...
## $ stat15 : num -0.532 -0.145 1.954 2.306 0.164 ...
## $ stat16 : num 0.603 -2.036 -1.886 -2.895 -0.661 ...
## $ stat17 : num -1.0452 0.0951 0.4029 2.9745 -0.9847 ...
## $ stat18 : num 2.354 0.473 1.466 2.39 0.69 ...
## $ stat19 : num 2.4 1.89 -1.5 2.31 1.59 ...
## $ stat20 : num 0.263 2.789 2.916 -1.189 -2.12 ...
## $ stat21 : num -0.979 -1.392 -2.389 -2.198 1.796 ...
## $ stat22 : num 1.787 -1.72 2.816 1.367 -0.936 ...
## $ stat23 : num -2.37 -2.33 -2.54 -1.97 2.05 ...
## $ stat24 : num 2.858 1.558 0.142 -1.408 -2.208 ...
## $ stat25 : num -0.472 -1.957 0.357 2.51 -1.928 ...
## $ stat26 : num -2.82 1.55 -1.05 1.68 -2.12 ...
## $ stat27 : num -0.952 -0.508 -2.154 -0.255 1.818 ...
## $ stat28 : num 2.8889 -1.5872 0.0307 -2.9038 -1.4217 ...
## $ stat29 : num 0.799 1.976 -0.446 1.057 0.885 ...
## $ stat30 : num -2.006 -0.387 1.028 2.559 2.277 ...
## $ stat31 : num -0.246 1.357 1.4 -2.983 2.65 ...
## $ stat32 : num 0.648 2.649 -1.018 -1.13 2.305 ...
## $ stat33 : num -2.8746 2.2846 1.4111 0.0547 -2.3915 ...
## $ stat34 : num -0.36 1.86 -2.42 -1.56 -1.83 ...
## $ stat35 : num 2.429 1.371 -0.981 1.097 -1.097 ...
## $ stat36 : num -0.542 -1.371 2.057 -2.282 1.487 ...
## $ stat37 : num -2.678 1.39 0.885 1.885 -2.374 ...
## $ stat38 : num -2.887 1.227 2.057 0.539 -0.374 ...
## $ stat39 : num -0.895 -0.893 1.122 2.733 1.427 ...
## $ stat40 : num 1.175 1.054 1.853 -0.437 1.255 ...
## $ stat41 : num -1.047 2.538 1.148 -1.381 0.226 ...
## $ stat42 : num -1.391 1.648 0.229 -2.79 1.954 ...
## $ stat43 : num 2.5411 0.4413 0.0889 2.383 2.6643 ...
## $ stat44 : num -1.432 -2.505 2.304 0.169 0.803 ...
## $ stat45 : num 0.63 1.273 -0.774 -2.159 -1.552 ...
## $ stat46 : num -2.093 1.725 -0.073 1.608 1.618 ...
## $ stat47 : num -2.832 -0.58 0.792 -1.889 2.109 ...
## $ stat48 : num 2.145 -1.369 1.571 0.568 -2.72 ...
## $ stat49 : num 0.567 1.491 1.104 -0.702 2.196 ...
## $ stat50 : num 0.154 1.247 -0.255 -0.397 -0.262 ...
## $ stat51 : num 0.629 0.89 -2.166 0.158 1.211 ...
## $ stat52 : num 2.22 -2.602 0.266 2.177 0.826 ...
## $ stat53 : num 2.18 -2.11 1.23 2.54 -2.46 ...
## $ stat54 : num 0.555 1.386 2.134 -2.139 2.163 ...
## $ stat55 : num -2.197 0.0878 1.6523 0.1286 0.6044 ...
## $ stat56 : num -0.288 2 -0.439 -1.991 2.545 ...
## $ stat57 : num 1.323 0.801 -0.181 0.963 -1.498 ...
## $ stat58 : num -1.33 -0.27 2.11 1.65 2.61 ...
## $ stat59 : num 1.2424 0.0638 0.9322 -0.2984 -1.1761 ...
## $ stat60 : num -2.58 0.947 2.46 0.727 -1.795 ...
## $ stat61 : num 1.328 1.117 0.465 -2.313 -2.669 ...
## $ stat62 : num 1.6856 0.0313 -1.7103 -1.477 0.1781 ...
## $ stat63 : num 0.628 -2.194 -0.516 2.591 2.896 ...
## $ stat64 : num -1.68 0.338 1.828 -1.513 2.941 ...
## $ stat65 : num -2.949 -1.117 -0.223 -0.352 -2.165 ...
## $ stat66 : num -0.333 -1.573 -0.45 -2.072 1.2 ...
## $ stat67 : num 1.575 -2.923 0.793 0.944 2.827 ...
## $ stat68 : num -2.298 0.266 -1.245 2.921 0.746 ...
## $ stat69 : num 1.55 -1.96 -2.23 0.51 1.68 ...
## $ stat70 : num -1.35 2.51 2.31 -2.44 -1.28 ...
## $ stat71 : num 1.026 0.353 -2.18 -2.405 1.354 ...
## $ stat72 : num 2.107 1.692 -2.265 2.088 -0.809 ...
## $ stat73 : num 2.663 -1.217 0.142 -0.863 -0.512 ...
## $ stat74 : num -2.892 -1.727 0.989 0.401 -2.17 ...
## $ stat75 : num -0.0213 2.2118 1.9559 -1.1699 1.0734 ...
## [list output truncated]
Checking correlations to evaluate removal of redundant features
# Pairwise correlations between all numeric columns of `features`,
# rounded to two decimals.
corr.matrix <- round(cor(Filter(is.numeric, features)), 2)

# Keep only the variables whose absolute correlation with at least one
# *other* variable reaches the threshold.
threshold <- 0.6
corr.matrix.tmp <- corr.matrix
diag(corr.matrix.tmp) <- 0  # zero the self-correlations so they never match
high.corr <- apply(abs(corr.matrix.tmp) >= threshold, MARGIN = 1, FUN = any)
high.corr.matrix <- corr.matrix.tmp[high.corr, high.corr]

# Interactive tables: the full matrix, then the highly correlated subset.
DT::datatable(corr.matrix)
DT::datatable(high.corr.matrix)
Clean Column Names
Feature Names
# Predictor names: every column of `features` except those listed in `drops`.
drops <- c("JobName")
feature.names <- colnames(features)[!(colnames(features) %in% drops)]
str(feature.names)
## chr [1:240] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10" "x11" ...
Read and Clean Labels
# Load the label table, then keep only the job identifier and the
# column(s) named by `output.var`.
labels <- read.csv(file = "../../Data/labels.csv")
# str(labels)
labels <- labels[, c("JobName", output.var)]
summary(labels)
## JobName y3
## Job_00001: 1 Min. : 95.91
## Job_00002: 1 1st Qu.:118.21
## Job_00003: 1 Median :123.99
## Job_00004: 1 Mean :125.36
## Job_00005: 1 3rd Qu.:131.06
## Job_00006: 1 Max. :193.73
## (Other) :9994 NA's :2497
Clean Column Names
Merge Datasets
# Join features and labels on the job identifier, then drop the identifier
# so only modelling columns remain.
data <- merge(x = features, y = labels, by = "JobName")
drops <- c("JobName")
data <- data[, !(colnames(data) %in% drops)]
str(data)
## 'data.frame': 10000 obs. of 241 variables:
## $ x1 : num 2.073 2.268 1.742 0.787 2.334 ...
## $ x2 : num 4.92 4.96 2.06 2.61 4.3 ...
## $ x3 : num 20 19.1 13.4 17.2 14.6 ...
## $ x4 : num 3.52 19.76 38.83 64.4 52.54 ...
## $ x5 : num 7.86 6.93 6.27 5.38 6.79 ...
## $ x6 : num 1.607 1.362 2.053 0.907 2.461 ...
## $ x7 : num 2.98 2.39 2.04 2.4 2.89 ...
## $ x8 : num 8.54 6.56 10.28 13.49 9.36 ...
## $ x9 : num 1.103 0.589 4.834 3.34 1.246 ...
## $ x10 : num 4.61 1.03 4.39 4.51 1.73 ...
## $ x11 : num 1.05e-07 1.03e-07 1.06e-07 9.47e-08 1.01e-07 1.07e-07 9.89e-08 9.30e-08 9.70e-08 9.47e-08 ...
## $ x12 : num 8 7.49 6.35 9.55 9.6 ...
## $ x13 : num 13.22 22.56 15.05 17.17 5.79 ...
## $ x14 : num 4.38 2.06 3.26 3.09 3.94 ...
## $ x15 : num 0.237 0.564 2.06 1.881 1.582 ...
## $ x16 : num 6.08 6.9 8.42 11.19 7.1 ...
## $ x17 : num 3.99 4.15 4.49 2.13 3.56 ...
## $ x18 : num 4.77 6.85 3.49 5.59 7.77 ...
## $ x19 : num 2.7 9.62 4.72 5.11 1.36 ...
## $ x20 : num 1.04 1.92 1.56 1.49 1.24 ...
## $ x21 : num 42.4 26.6 20.1 32.6 44.6 ...
## $ x22 : num 1.36 4.05 3.08 1.36 1.94 ...
## $ x23 : num 2.7 2.38 4.49 3.4 2.25 ...
## $ stat1 : num 2.38 -1.407 -0.767 0.437 2.449 ...
## $ stat2 : num 0.188 1.814 -0.123 -1.936 -0.617 ...
## $ stat3 : num -1.228 1.62 1.142 0.903 -2.552 ...
## $ stat4 : num -0.6 2.64 2.98 -1.6 -2.15 ...
## $ stat5 : num 0.1489 1.9208 2.4226 -0.0018 -2.3111 ...
## $ stat6 : num -0.662 1.741 -0.417 -0.695 -1.017 ...
## $ stat7 : num -2.485 -1.96 2.221 -0.369 2.727 ...
## $ stat8 : num 0.365 -2.019 -2.674 -0.971 1.542 ...
## $ stat9 : num 2.536 -1.373 0.484 1.796 -1.316 ...
## $ stat10 : num 2.9207 -0.3164 2.7338 0.7477 -0.0977 ...
## $ stat11 : num -2.323 -0.855 -2.182 1.398 0.957 ...
## $ stat12 : num -2.48 1.12 2.87 1.86 2.57 ...
## $ stat13 : num -0.634 0.723 -2.976 -1.038 0.318 ...
## $ stat14 : num -0.365 0.212 2.987 2.334 1.031 ...
## $ stat15 : num -0.532 -0.145 1.954 2.306 0.164 ...
## $ stat16 : num 0.603 -2.036 -1.886 -2.895 -0.661 ...
## $ stat17 : num -1.0452 0.0951 0.4029 2.9745 -0.9847 ...
## $ stat18 : num 2.354 0.473 1.466 2.39 0.69 ...
## $ stat19 : num 2.4 1.89 -1.5 2.31 1.59 ...
## $ stat20 : num 0.263 2.789 2.916 -1.189 -2.12 ...
## $ stat21 : num -0.979 -1.392 -2.389 -2.198 1.796 ...
## $ stat22 : num 1.787 -1.72 2.816 1.367 -0.936 ...
## $ stat23 : num -2.37 -2.33 -2.54 -1.97 2.05 ...
## $ stat24 : num 2.858 1.558 0.142 -1.408 -2.208 ...
## $ stat25 : num -0.472 -1.957 0.357 2.51 -1.928 ...
## $ stat26 : num -2.82 1.55 -1.05 1.68 -2.12 ...
## $ stat27 : num -0.952 -0.508 -2.154 -0.255 1.818 ...
## $ stat28 : num 2.8889 -1.5872 0.0307 -2.9038 -1.4217 ...
## $ stat29 : num 0.799 1.976 -0.446 1.057 0.885 ...
## $ stat30 : num -2.006 -0.387 1.028 2.559 2.277 ...
## $ stat31 : num -0.246 1.357 1.4 -2.983 2.65 ...
## $ stat32 : num 0.648 2.649 -1.018 -1.13 2.305 ...
## $ stat33 : num -2.8746 2.2846 1.4111 0.0547 -2.3915 ...
## $ stat34 : num -0.36 1.86 -2.42 -1.56 -1.83 ...
## $ stat35 : num 2.429 1.371 -0.981 1.097 -1.097 ...
## $ stat36 : num -0.542 -1.371 2.057 -2.282 1.487 ...
## $ stat37 : num -2.678 1.39 0.885 1.885 -2.374 ...
## $ stat38 : num -2.887 1.227 2.057 0.539 -0.374 ...
## $ stat39 : num -0.895 -0.893 1.122 2.733 1.427 ...
## $ stat40 : num 1.175 1.054 1.853 -0.437 1.255 ...
## $ stat41 : num -1.047 2.538 1.148 -1.381 0.226 ...
## $ stat42 : num -1.391 1.648 0.229 -2.79 1.954 ...
## $ stat43 : num 2.5411 0.4413 0.0889 2.383 2.6643 ...
## $ stat44 : num -1.432 -2.505 2.304 0.169 0.803 ...
## $ stat45 : num 0.63 1.273 -0.774 -2.159 -1.552 ...
## $ stat46 : num -2.093 1.725 -0.073 1.608 1.618 ...
## $ stat47 : num -2.832 -0.58 0.792 -1.889 2.109 ...
## $ stat48 : num 2.145 -1.369 1.571 0.568 -2.72 ...
## $ stat49 : num 0.567 1.491 1.104 -0.702 2.196 ...
## $ stat50 : num 0.154 1.247 -0.255 -0.397 -0.262 ...
## $ stat51 : num 0.629 0.89 -2.166 0.158 1.211 ...
## $ stat52 : num 2.22 -2.602 0.266 2.177 0.826 ...
## $ stat53 : num 2.18 -2.11 1.23 2.54 -2.46 ...
## $ stat54 : num 0.555 1.386 2.134 -2.139 2.163 ...
## $ stat55 : num -2.197 0.0878 1.6523 0.1286 0.6044 ...
## $ stat56 : num -0.288 2 -0.439 -1.991 2.545 ...
## $ stat57 : num 1.323 0.801 -0.181 0.963 -1.498 ...
## $ stat58 : num -1.33 -0.27 2.11 1.65 2.61 ...
## $ stat59 : num 1.2424 0.0638 0.9322 -0.2984 -1.1761 ...
## $ stat60 : num -2.58 0.947 2.46 0.727 -1.795 ...
## $ stat61 : num 1.328 1.117 0.465 -2.313 -2.669 ...
## $ stat62 : num 1.6856 0.0313 -1.7103 -1.477 0.1781 ...
## $ stat63 : num 0.628 -2.194 -0.516 2.591 2.896 ...
## $ stat64 : num -1.68 0.338 1.828 -1.513 2.941 ...
## $ stat65 : num -2.949 -1.117 -0.223 -0.352 -2.165 ...
## $ stat66 : num -0.333 -1.573 -0.45 -2.072 1.2 ...
## $ stat67 : num 1.575 -2.923 0.793 0.944 2.827 ...
## $ stat68 : num -2.298 0.266 -1.245 2.921 0.746 ...
## $ stat69 : num 1.55 -1.96 -2.23 0.51 1.68 ...
## $ stat70 : num -1.35 2.51 2.31 -2.44 -1.28 ...
## $ stat71 : num 1.026 0.353 -2.18 -2.405 1.354 ...
## $ stat72 : num 2.107 1.692 -2.265 2.088 -0.809 ...
## $ stat73 : num 2.663 -1.217 0.142 -0.863 -0.512 ...
## $ stat74 : num -2.892 -1.727 0.989 0.401 -2.17 ...
## $ stat75 : num -0.0213 2.2118 1.9559 -1.1699 1.0734 ...
## $ stat76 : num -2.506 1.933 0.295 -1.239 2.67 ...
## [list output truncated]
Exploratory Data Analysis
Scatterplots
#' Histogram panel for the current plot region.
#'
#' Draws the histogram of `x` as cyan rectangles whose heights are scaled so
#' the tallest bar equals 1. While drawing, the y range of the user
#' coordinates is temporarily set to [0, 1.5]; the x range is left untouched.
#' The previous user coordinates are restored when the function exits.
#'
#' @param x Values passed to `hist()` to compute breaks and counts.
#' @param ... Further arguments forwarded to `rect()`.
#' @return The value of the final `rect()` call.
panel.hist <- function(x, ...) {
  usr <- par("usr")
  # Restore by *name*: an unnamed numeric vector handed to par() is not a
  # valid parameter specification, so `par(usr)` would only raise a warning
  # and leave the modified coordinates in place.
  on.exit(par(usr = usr), add = TRUE)
  par(usr = c(usr[1:2], 0, 1.5))

  h <- hist(x, plot = FALSE)
  breaks <- h$breaks
  nB <- length(breaks)
  y <- h$counts / max(h$counts)  # rescale so the tallest bar has height 1

  rect(breaks[-nB], 0, breaks[-1], y, col = "cyan", ...)
}
# When the `eda` flag compares equal to TRUE, draw a histogram of the
# `label.names` column(s) using only rows of `data` with no missing values.
# NOTE(review): `eda` and `label.names` are not defined in the visible part
# of the file -- presumably set in an earlier parameter chunk; confirm.
if (eda == TRUE){
hist(data[complete.cases(data),label.names])
#hist(data[complete.cases(data),alt.scale.label.name])
}

# https://stackoverflow.com/questions/24648729/plot-one-numeric-variable-against-n-numeric-variables-in-n-plots
#' Plot one response variable against each of several predictors.
#'
#' Produces one scatter plot per entry of `xvars`, each with the predictor on
#' the x axis and `yvar` on the y axis. Nothing is drawn when `xvars` is empty.
#'
#' @param data Data frame holding the predictor and response columns.
#' @param xvars Names of the columns to plot on the x axis. When `NULL`
#'   (the default), every column of `data` other than `yvar` is used.
#' @param yvar Name of the column to plot on the y axis.
#' @return `NULL`, invisibly; called for its plotting side effect.
ind.pairs.plot <- function(data, xvars = NULL, yvar) {
  if (is.null(xvars)) {
    xvars <- names(data)[names(data) != yvar]
  }
  # if (length(xvars) > 25) {
  #   print("Warning: number of variables to be plotted exceeds 25, only first 25 will be plotted")
  #   xvars = xvars[1:25]
  # }
  # choose a format to display charts
  # nrows = ceiling(sqrt(ncharts))
  # ncols = ceiling(ncharts/nrows)
  # par(mfrow = c(nrows,ncols))

  # seq_along() rather than 1:length(): with zero predictors `1:0` would
  # iterate over c(1, 0) and fail on an undefined column.
  for (i in seq_along(xvars)) {
    plot(data[, xvars[i]], data[, yvar], xlab = xvars[i], ylab = yvar)
  }
  invisible(NULL)
}
# Draw one scatter plot per name in `feature.names`, each against the
# `label.names` column of the merged data.
ind.pairs.plot(data, feature.names, label.names)
















































































































































































































































Feature Engineering
# x18 may need transformations: plot the label against the raw values,
# their square root, and their square.
plot(
  x = data[["x18"]], y = data[, label.names],
  main = "Original Scatter Plot vs. x18",
  xlab = "x18", ylab = label.names
)

plot(
  x = sqrt(data[["x18"]]), y = data[, label.names],
  main = "Original Scatter Plot vs. sqrt(x18)",
  xlab = "sqrt(x18)", ylab = label.names
)

plot(
  x = data[["x18"]]^2, y = data[, label.names],
  main = "Original Scatter Plot vs. square(x18)",
  xlab = "x18**2", ylab = label.names
)
